import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from umap.umap_ import UMAP
import plotly.express as px
# PCA
from sklearn.decomposition import PCA
# Library for t-SNE projections
from sklearn.manifold import TSNE
meta = pd.read_csv('lgg_lb_meta.csv')
meta = meta.set_index(['SDG_ID'])
#meta
df = pd.read_csv('lb_plasma_matrix.csv')
tdf = df.T
tdf.columns = tdf.iloc[0]
tdf = tdf[1:]
#tdf
main_df = pd.concat([meta, tdf], axis=1, join="inner")
#main_df
short_histology_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Tumor_Subtype', 'Relapse', 'Survival_Status'],axis=1)
short_histology_df
| Short_Histology | CTRL_ANT1 | CTRL_ANT2 | CTRL_ANT3 | CTRL_ANT4 | CTRL_ANT5 | CTRL_miR_POS | HK_ACTB | HK_B2M | HK_GAPDH | ... | miR-944 | miR-95-3p | miR-95-5p | miR-9-5p | miR-96-3p | miR-96-5p | miR-98-3p | miR-99a-5p | miR-99b-3p | miR-99b-5p | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15635-37 | LGG | 16 | 16 | 22 | 25 | 5 | 20225 | 27 | 109 | 6245 | ... | 23 | 44 | 31 | 19 | 21 | 125 | 38 | 891 | 75 | 290 |
| 15635-43 | HGG | 35 | 60 | 49 | 60 | 47 | 51017 | 30 | 103 | 12707 | ... | 44 | 83 | 76 | 37 | 37 | 95 | 96 | 1876 | 78 | 662 |
| 15635-45 | HGG | 122 | 139 | 145 | 95 | 127 | 16810 | 145 | 1535 | 3414 | ... | 79 | 179 | 212 | 179 | 113 | 430 | 111 | 1141 | 102 | 1034 |
| 15635-46 | LGG | 41 | 60 | 99 | 68 | 49 | 58350 | 95 | 239 | 29270 | ... | 11 | 99 | 172 | 67 | 35 | 62 | 118 | 1972 | 143 | 596 |
| 15635-53 | HGG | 6 | 20 | 3 | 27 | 5 | 20365 | 39 | 302 | 6065 | ... | 13 | 51 | 27 | 7 | 20 | 316 | 26 | 2028 | 110 | 540 |
| 15635-60 | LGG | 12 | 68 | 43 | 102 | 35 | 87035 | 58 | 199 | 19675 | ... | 53 | 98 | 127 | 28 | 104 | 20 | 123 | 1020 | 174 | 370 |
| 15635-68 | LGG | 99 | 86 | 36 | 99 | 60 | 39440 | 79 | 748 | 15361 | ... | 61 | 190 | 162 | 98 | 75 | 256 | 67 | 1842 | 168 | 1263 |
| 15635-80 | LGG | 56 | 87 | 97 | 461 | 47 | 7490 | 77 | 150 | 17922 | ... | 68 | 82 | 142 | 65 | 55 | 263 | 37 | 377 | 217 | 960 |
| 15635-87 | HGG | 76 | 77 | 97 | 88 | 98 | 44954 | 45 | 262 | 4624 | ... | 117 | 201 | 202 | 47 | 65 | 297 | 55 | 3270 | 248 | 896 |
| 15635-90 | LGG | 69 | 52 | 76 | 110 | 68 | 20213 | 67 | 175 | 5244 | ... | 40 | 84 | 56 | 81 | 73 | 168 | 46 | 3492 | 155 | 282 |
| 15635-100 | LGG | 0 | 17 | 26 | 20 | 18 | 38373 | 195 | 3374 | 17375 | ... | 0 | 57 | 62 | 24 | 22 | 120 | 72 | 3586 | 73 | 919 |
| 15635-101 | LGG | 39 | 30 | 21 | 33 | 23 | 19047 | 17 | 86 | 8125 | ... | 39 | 62 | 92 | 39 | 27 | 126 | 33 | 1508 | 64 | 254 |
| 15635-127 | LGG | 16 | 6 | 19 | 73 | 16 | 24390 | 33 | 304 | 8055 | ... | 7 | 28 | 50 | 38 | 10 | 113 | 27 | 865 | 44 | 335 |
| 15635-134 | LGG | 28 | 50 | 19 | 64 | 4 | 47361 | 302 | 1665 | 12094 | ... | 60 | 147 | 104 | 40 | 31 | 180 | 75 | 2621 | 154 | 1349 |
| 15635-154 | HGG | 9 | 30 | 3 | 26 | 11 | 22451 | 5 | 55 | 2794 | ... | 8 | 92 | 65 | 15 | 52 | 259 | 29 | 985 | 69 | 467 |
| 15635-156 | HGG | 45 | 14 | 36 | 15 | 23 | 37931 | 75 | 837 | 2349 | ... | 46 | 220 | 67 | 0 | 0 | 307 | 40 | 8387 | 128 | 1737 |
| 15635-239 | HGG | 24 | 24 | 77 | 92 | 24 | 23781 | 105 | 223 | 306 | ... | 44 | 110 | 94 | 49 | 27 | 340 | 53 | 2902 | 189 | 823 |
17 rows × 2103 columns
# Store histologies
hist = short_histology_df['Short_Histology'].to_list()
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
proj_3d = umap_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
ufig_2d = px.scatter(proj_2d, x=0, y=1, title='UMAP Plot of Short Histology', color=hist)
ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
ufig_2d.show()
#ufig_3d.show()
# Plot t-SNE
tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')
proj_2d = tsne_2d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
proj_3d = tsne_3d.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Short Histology', color=hist)
tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
tfig_2d.show()
#tfig_3d.show()
# Perform PCA
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)
df_pca2 = pca2.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
df_pca3 = pca3.fit_transform(short_histology_df.loc[:,'CTRL_ANT1':])
# Display PCA plot
pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Short Histology', color=hist)
# Display 3D PCA plot
pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=hist)
pca_2d.show()
#pca_3d.show()
# Split the dataset into features and labels
sh_X = short_histology_df.loc[:, short_histology_df.columns != 'Short_Histology'].values
sh_y = short_histology_df.loc[:, short_histology_df.columns == 'Short_Histology'].values.ravel()
# Split data into training and testing set
sh_X_train, sh_X_test, sh_y_train, sh_y_test = train_test_split(sh_X, sh_y, test_size=0.25, random_state=42)
#Sanity check
print(sh_X_train.shape, sh_X_test.shape, sh_y_train.shape, sh_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
# Class Imbalance
fig = px.histogram(short_histology_df, x='Short_Histology', title='Histology Class Balance')
fig.show()
# Initialize random forest classifier
sh_rf = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
sh_rf.fit(sh_X_train, sh_y_train)
# Make predictions using random forest classifier
sh_rf_y_pred = sh_rf.predict(sh_X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(sh_y_test, sh_rf_y_pred)}')
Accuracy: 0.6
# Calculate a confusion matrix
sh_cm = confusion_matrix(sh_y_test, sh_rf_y_pred, labels=sh_rf.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(sh_cm, text_auto=True,
labels=dict(x="True Subtype", y="Predicted Subtype", color="Productivity"),
x=short_histology_df['Short_Histology'].unique().tolist(),
y=short_histology_df['Short_Histology'].unique().tolist(),
title='Histology Classification Accuracy'
)
disp.show()
# What are the most important features?
sh_rf_feature_list = short_histology_df.columns
sh_rf_feature_list = sh_rf_feature_list.drop('Short_Histology')
sh_rf_imp_features = pd.Series(sh_rf.feature_importances_, index=sh_rf_feature_list)
sh_rf_imp_genes = sh_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
sh_rf_imp_genes.columns = ["features", "importance"]
sh_rf_imp_genes_fil = sh_rf_imp_genes[~(sh_rf_imp_genes == 0.000000).any(axis=1)]
sh_rf_imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-4685-3p | 0.030000 |
| 1 | miR-374b-3p | 0.030000 |
| 2 | miR-4727-5p | 0.020000 |
| 3 | miR-6722-5p | 0.020000 |
| 4 | miR-6798-5p | 0.020000 |
| ... | ... | ... |
| 94 | miR-223-5p | 0.002857 |
| 95 | miR-17-3p | 0.002857 |
| 96 | miR-181b-5p | 0.002857 |
| 97 | miR-6872-3p | 0.002857 |
| 98 | miR-6078 | 0.002857 |
99 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(sh_rf_imp_genes_fil, x=sh_rf_imp_genes_fil.features, y=sh_rf_imp_genes_fil.importance)
fig.show()
relapse_df = main_df.drop(['Specimen_Type', 'Diagnosis', 'Short_Histology', 'Tumor_Subtype', 'Survival_Status'],axis=1)
#relapse_df
# Store relapse data
relapse = relapse_df['Relapse'].to_list()
# Plot UMAP
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
proj_3d = umap_3d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
ufig_2d = px.scatter(proj_2d, x=0, y=1, title='UMAP Plot of Relapse', color=relapse)
ufig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)
ufig_2d.show()
#ufig_3d.show()
# Plot t-SNE
tsne_2d = TSNE(n_components=2, init='random', random_state=0, learning_rate='auto')
tsne_3d = TSNE(n_components=3, init='random', random_state=0, learning_rate='auto')
proj_2d = tsne_2d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
proj_3d = tsne_3d.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
tfig_2d = px.scatter(proj_2d, x=0, y=1, title='t-SNE Plot of Relapse', color=relapse)
tfig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)
tfig_2d.show()
#tfig_3d.show()
# Perform PCA
pca2 = PCA(n_components=2)
pca3 = PCA(n_components=3)
df_pca2 = pca2.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
df_pca3 = pca3.fit_transform(relapse_df.loc[:,'CTRL_ANT1':])
# Display PCA plot
pca_2d = px.scatter(df_pca2, x=0, y=1, title='PCA Plot of Relapse', color=relapse)
# Display 3D PCA plot
pca_3d = px.scatter_3d(df_pca3, x=0, y=1, z=2, color=relapse)
pca_2d.show()
#pca_3d.show()
# Split the dataset into features and labels
r_X = relapse_df.loc[:, relapse_df.columns != 'Relapse'].values
r_y = relapse_df.loc[:, relapse_df.columns == 'Relapse'].values.ravel()
# Split data into training and testing set
r_X_train, r_X_test, r_y_train, r_y_test = train_test_split(r_X, r_y, test_size=0.25, random_state=42)
#Sanity check
print(r_X_train.shape, r_X_test.shape, r_y_train.shape, r_y_test.shape)
(12, 2102) (5, 2102) (12,) (5,)
# Class Imbalance
fig = px.histogram(relapse_df, x='Relapse')
fig.show()
# Initialize random forest classifier
r_rf = RandomForestClassifier(max_depth=2, random_state=0)
# Train the random forest classifier
r_rf.fit(r_X_train, r_y_train)
# Make predictions using random forest classifier
r_rf_y_pred = r_rf.predict(r_X_test)
# Accuracy of model
print(f'Accuracy: {accuracy_score(r_y_test, r_rf_y_pred)}')
Accuracy: 0.8
# Calculate a confusion matrix
r_cm = confusion_matrix(r_y_test, r_rf_y_pred, labels=r_rf.classes_)
# Display confusion matrix to look at how accurately the ML model was able to classify each tumor type
disp = px.imshow(r_cm, text_auto=True,
labels=dict(x="True Relapse", y="Predicted Relapse", color="Productivity"),
x=relapse_df['Relapse'].unique().tolist(),
y=relapse_df['Relapse'].unique().tolist()
)
disp.show()
print(r_X_test)
print('True Relapse Label:')
print(r_y_test)
print('Predicted Relapse:')
print(r_rf_y_pred)
[[16 16 22 ... 891 75 290] [35 60 49 ... 1876 78 662] [12 68 43 ... 1020 174 370] [45 14 36 ... 8387 128 1737] [39 30 21 ... 1508 64 254]] True Relapse Label: ['Yes' 'Yes' 'No' 'Yes' 'No'] Predicted Relapse: ['Yes' 'Yes' 'No' 'Yes' 'Yes']
The ML algorithm misclassified patient 15635-101 as having a relapse, but this patient did not at the time of plasma collection. This patient should be investigated further for relapse.
# How is the Random Forest Model Learning?
from sklearn import tree
fn = relapse_df.loc[:, relapse_df.columns != 'Relapse'].columns.to_list()
cn = r_rf.classes_.tolist()
tree.plot_tree(r_rf.estimators_[0], feature_names = fn, class_names=cn, filled = True)
[Text(0.5, 0.75, 'miR-873-5p <= 1924.0\ngini = 0.278\nsamples = 6\nvalue = [2, 10]\nclass = Yes'), Text(0.25, 0.25, 'gini = 0.0\nsamples = 5\nvalue = [0, 10]\nclass = Yes'), Text(0.75, 0.25, 'gini = 0.0\nsamples = 1\nvalue = [2, 0]\nclass = No')]
# What are the most important features?
r_rf_feature_list = relapse_df.columns
r_rf_feature_list = r_rf_feature_list.drop('Relapse')
r_rf_imp_features = pd.Series(r_rf.feature_importances_, index=r_rf_feature_list)
r_rf_imp_genes = r_rf_imp_features.sort_values(ascending=False).to_frame().reset_index()
r_rf_imp_genes.columns = ["features", "importance"]
r_rf_imp_genes_fil = r_rf_imp_genes[~(r_rf_imp_genes == 0.000000).any(axis=1)]
r_rf_imp_genes_fil
| features | importance | |
|---|---|---|
| 0 | miR-6506-3p | 0.030303 |
| 1 | miR-502-5p | 0.020202 |
| 2 | miR-4677-5p | 0.020202 |
| 3 | miR-19a-5p | 0.020202 |
| 4 | miR-887-5p | 0.020202 |
| ... | ... | ... |
| 91 | miR-7106-3p | 0.006734 |
| 92 | miR-1976 | 0.006734 |
| 93 | miR-7157-3p | 0.003367 |
| 94 | miR-4636 | 0.003367 |
| 95 | miR-505-3p | 0.002886 |
96 rows × 2 columns
# Display interactive Barplot of important miRNAs
fig = px.bar(r_rf_imp_genes_fil, x=r_rf_imp_genes_fil.features, y=r_rf_imp_genes_fil.importance)
fig.show()
# Create a dataframe containing only ML selected genes
ml_hist_df = short_histology_df.filter(sh_rf_imp_genes_fil['features'].to_list()[0:1])
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_hist_df)
proj_3d = umap_3d.fit_transform(ml_hist_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=hist, title='UMAP Plot using genes found by Random Forest Algorithm')
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
fig_2d.show()
fig_3d.show()
sh_rf_imp_genes_fil['features'].to_list()[0:1]
['miR-4685-3p']
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list())
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=relapse, title='UMAP Plot for using genes found by Random Forest Algorithm (Relapse)')
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)
fig_2d.show()
fig_3d.show()
# Create a dataframe containing only ML selected genes
ml_relapse_df = relapse_df.filter(r_rf_imp_genes_fil['features'].to_list()[0:1]) # Also [0:10]
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(ml_relapse_df)
proj_3d = umap_3d.fit_transform(ml_relapse_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=relapse, title='UMAP Plot using most important gene found by Random Forest Algorithm (Relapse)')
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)
fig_2d.show()
fig_3d.show()
# The most important gene determined by ML algorithm
r_rf_imp_genes_fil['features'].to_list()[0:1]
['miR-6506-3p']
Interestingly, patient 15635-101 is being clustered with the relapsed patients based on just miRNA signature.
# Add "hsa-" to all miRNA genes
hist_genes = sh_rf_imp_genes_fil['features'].to_list()
relapse_genes = r_rf_imp_genes_fil['features'].to_list()
hist_genes = ["hsa-" + mirnas for mirnas in hist_genes]
relapse_genes = ["hsa-" + mirnas for mirnas in relapse_genes]
# Export miRNA genes important for distinguishing LGG and HGG brain tumors
with open("LGG_miRNAs.txt", 'w') as file:
for row in hist_genes:
s = "".join(map(str, row))
file.write(s+'\n')
# Export miRNA genes important for relapse in LGG/HGG brain tumors
with open("LGG_relapse_miRNAs.txt", 'w') as file:
for row in relapse_genes:
s = "".join(map(str, row))
file.write(s+'\n')
# relapse_cols = relapse_df.columns
# relapse_cols
# cols = list(set(relapse_cols) & set(dip_test_genes))
# cols
# print(len(dip_test_genes))
# print(len(cols))
# list(set(cols) ^ set(dip_test_genes))
dip_test_genes = ['miR-339-5p', 'miR-1299', 'let-7g-5p', 'miR-145-3p', 'miR-6752-5p', 'miR-708-5p',
'miR-3679-3p','miR-448', 'miR-548ag', 'miR-4796-5p', 'miR-6844', 'miR-5697', 'miR-4765',
'miR-8076', 'miR-1278', 'miR-3606-5p', 'miR-4424', 'miR-2114-3p', 'miR-4677-5p', 'miR-644a',
'miR-6739-3p']
# Create a dataframe containing only dip test genes
dip_relapse_df = relapse_df[dip_test_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(dip_relapse_df)
proj_3d = umap_3d.fit_transform(dip_relapse_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=relapse, title='UMAP Plot using Dip Test genes (Relapse)')
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=relapse)
fig_2d.show()
fig_3d.show()
# Create a dataframe containing only dip test genes
dip_histology_df = short_histology_df[dip_test_genes]
# Plot the UMAP Figures
umap_2d = UMAP(n_components=2, init='random', random_state=0)
umap_3d = UMAP(n_components=3, init='random', random_state=0)
proj_2d = umap_2d.fit_transform(dip_histology_df)
proj_3d = umap_3d.fit_transform(dip_histology_df)
fig_2d = px.scatter(proj_2d, x=0, y=1, color=hist, title='UMAP Plot using Dip Test genes (Histology)')
fig_3d = px.scatter_3d(proj_3d, x=0, y=1, z=2, color=hist)
fig_2d.show()
fig_3d.show()